From b118dbcd15c94e42d5847c1bd16b0e84a9f53c97 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 27 Jul 2023 08:47:48 -0700 Subject: [PATCH 01/35] refactor mg api core function to match sg api --- .../simpleDistributedGraph.py | 105 +++++++++++------- .../graph_implementation/simpleGraph.py | 6 +- .../cugraph/cugraph/structure/number_map.py | 10 +- 3 files changed, 78 insertions(+), 43 deletions(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index c0efb425b75..50cab9c0dbb 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -216,10 +216,11 @@ def __from_edgelist( # The symmetrize step may add additional edges with unknown # ids and types for an undirected graph. Therefore, only # directed graphs may be used with ids and types. - if len(edge_attr) == 3: + # FIXME: Drop the check in symmetrize.py as it is redundant + if len(edge_attr) == 3 or edge_id is not None or edge_type is not None: if not self.properties.directed: raise ValueError( - "User-provided edge ids and edge " + "User-provided edge ids and/or edge " "types are not permitted for an " "undirected graph." ) @@ -318,7 +319,6 @@ def __from_edgelist( ddf = ddf.map_partitions(lambda df: df.copy()) ddf = persist_dask_df_equal_parts_per_worker(ddf, _client) num_edges = len(ddf) - self._number_of_edges = num_edges ddf = get_persisted_df_worker_map(ddf, _client) delayed_tasks_d = { w: delayed(simpleDistributedGraphImpl._make_plc_graph)( @@ -358,6 +358,8 @@ def renumbered(self): def view_edge_list(self): """ + FIXME: For undirected graph, is it trully returning the upper triangular + matrix of the symmetrized edgelist? Display the edge list. Compute it if needed. NOTE: If the graph is of type Graph() then the displayed undirected edges are the same as displayed by networkx Graph(), but the direction @@ -388,7 +390,34 @@ def view_edge_list(self): """ if self.edgelist is None: raise RuntimeError("Graph has no Edgelist.") - return self.edgelist.edgelist_df + + edgelist_df = self.input_df + is_string_dtype = False + if not self.properties.directed: + srcCol = self.source_columns + dstCol = self.destination_columns + if self.renumber_map.unrenumbered_id_type == "object": + # FIXME: Use the renumbered vertices instead and then un-renumber. + # This operation can be expensive. + is_string_dtype = True + edgelist_df = self.edgelist.edgelist_df + srcCol = self.renumber_map.renumbered_src_col_name + dstCol = self.renumber_map.renumbered_dst_col_name + + edgelist_df[srcCol], edgelist_df[dstCol] = \ + edgelist_df[[srcCol, dstCol]].min(axis=1), edgelist_df[[srcCol, dstCol]].max(axis=1) + + edgelist_df = edgelist_df.groupby(by=[srcCol, dstCol]).sum().reset_index() + wgtCol = simpleDistributedGraphImpl.edgeWeightCol + if wgtCol in edgelist_df.columns: + edgelist_df[wgtCol] /= 2 + + if is_string_dtype: + # unrenumber the vertices + edgelist_df = self.renumber_map.unrenumber(edgelist_df, srcCol) + edgelist_df = self.renumber_map.unrenumber(edgelist_df, dstCol) + self.properties.edge_count = len(edgelist_df) + return edgelist_df def delete_edge_list(self): """ @@ -407,23 +436,7 @@ def number_of_vertices(self): Get the number of nodes in the graph. """ if self.properties.node_count is None: - if self.edgelist is not None: - if self.renumbered is True: - src_col_name = self.renumber_map.renumbered_src_col_name - dst_col_name = self.renumber_map.renumbered_dst_col_name - # FIXME: from_dask_cudf_edgelist() currently requires - # renumber=True for MG, so this else block will not be - # used. Should this else block be removed and added back when - # the restriction is removed? - else: - src_col_name = "src" - dst_col_name = "dst" - - ddf = self.edgelist.edgelist_df[[src_col_name, dst_col_name]] - # ddf = self.edgelist.edgelist_df[["src", "dst"]] - self.properties.node_count = ddf.max().max().compute() + 1 - else: - raise RuntimeError("Graph is Empty") + self.properties.node_count = len(self.nodes()) return self.properties.node_count def number_of_nodes(self): @@ -436,10 +449,17 @@ def number_of_edges(self, directed_edges=False): """ Get the number of edges in the graph. """ - if self.edgelist is not None: - return self._number_of_edges - else: - raise RuntimeError("Graph is Empty") + + if directed_edges and self.edgelist is not None: + return len(self.edgelist.edgelist_df) + # FIXME: set 'self.properties.edge_count' when viewing the edgelist + # Update the view of edgelist for undirected graph in MG here (return only lower) + if self.properties.edge_count is None: + if self.edgelist is not None: + self.view_edge_list() + else: + raise RuntimeError("Graph is Empty") + return self.properties.edge_count def in_degree(self, vertex_subset=None): """ @@ -1031,10 +1051,9 @@ def edges(self): # used. Should this else block be removed and added back when # the restriction is removed? else: - src_col_name = "src" - dst_col_name = "dst" + src_col_name = self.source_columns + dst_col_name = self.destination_columns - # return self.view_edge_list()[["src", "dst"]] return self.view_edge_list()[[src_col_name, dst_col_name]] def nodes(self): @@ -1047,23 +1066,27 @@ def nodes(self): a dataframe and do 'renumber_map.unrenumber' or 'G.unrenumber' """ - if self.renumbered: - # FIXME: This relies on current implementation - # of NumberMap, should not really expose - # this, perhaps add a method to NumberMap + if self.edgelist is not None: + if self.renumbered: + # FIXME: This relies on current implementation + # of NumberMap, should not really expose + # this, perhaps add a method to NumberMap - df = self.renumber_map.implementation.ddf.drop(columns="global_id") + df = self.renumber_map.implementation.ddf.drop(columns="global_id") - if len(df.columns) > 1: - return df - else: - return df[df.columns[0]] + if len(df.columns) > 1: + return df + else: + return df[df.columns[0]] + else: + df = self.input_df + return dask_cudf.concat( + [df[self.source_columns], df[self.destination_columns]] + ).drop_duplicates() else: - df = self.input_df - return dask_cudf.concat( - [df[self.source_columns], df[self.destination_columns]] - ).drop_duplicates() + raise RuntimeError("Graph is Empty") + def neighbors(self, n): if self.edgelist is None: diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index d0c0ded5eb4..dd6d416327a 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -370,6 +370,8 @@ def to_numpy_matrix(self): def view_edge_list(self): """ + FIXME: For undirected graph, is it trully returning the upper triangular + matrix of the symmetrized edgelist? Display the edge list. Compute it if needed. NOTE: If the graph is of type Graph() then the displayed undirected edges are the same as displayed by networkx Graph(), but the direction @@ -407,6 +409,7 @@ def view_edge_list(self): edgelist_df = self.edgelist.edgelist_df + # FIXME: No need to un-renumber as it is expensive. if self.properties.renumbered: edgelist_df = self.renumber_map.unrenumber( edgelist_df, simpleGraphImpl.srcCol @@ -422,7 +425,8 @@ def view_edge_list(self): <= edgelist_df[simpleGraphImpl.dstCol] ] edgelist_df = edgelist_df.reset_index(drop=True) - self.properties.edge_count = len(edgelist_df) + + self.properties.edge_count = len(edgelist_df) return edgelist_df diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index 481f99b9060..85a2d50e816 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -480,7 +480,15 @@ def renumber_and_segment( # For columns with mismatch dtypes, set the renumbered # id_type to either 'int32' or 'int64' - if df.dtypes.nunique() > 1: + if isinstance(src_col_names, list): + vertex_col_names = src_col_names + else: + vertex_col_names = [src_col_names] + if isinstance(dst_col_names, list): + vertex_col_names += dst_col_names + else: + vertex_col_names += [dst_col_names] + if df[vertex_col_names].dtypes.nunique() > 1: # can't determine the edgelist input type unrenumbered_id_type = None else: From 5e1bb6acf4b3ce0a9fb27575fcd1d743447a0053 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 28 Jul 2023 09:59:57 -0700 Subject: [PATCH 02/35] refactor functions, add new attributes and fixmes --- .../simpleDistributedGraph.py | 34 +++++++++------- .../graph_implementation/simpleGraph.py | 40 ++++++++++++++----- .../cugraph/cugraph/structure/number_map.py | 23 ++++++++++- 3 files changed, 70 insertions(+), 27 deletions(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index 29241df1fdf..3b6e0b9d38c 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -79,6 +79,8 @@ def __init__(self, properties): self.properties = simpleDistributedGraphImpl.Properties(properties) self.source_columns = None self.destination_columns = None + self.weight_column = None + self.vertex_columns = None def _make_plc_graph( sID, @@ -175,6 +177,7 @@ def __from_edgelist( "and destination parameters" ) ddf_columns = s_col + d_col + self.vertex_columns = ddf_columns.copy() _client = default_client() workers = _client.scheduler_info()["workers"] # Repartition to 2 partitions per GPU for memory efficient process @@ -215,7 +218,7 @@ def __from_edgelist( # ids and types for an undirected graph. Therefore, only # directed graphs may be used with ids and types. # FIXME: Drop the check in symmetrize.py as it is redundant - if len(edge_attr) == 3 or edge_id is not None or edge_type is not None: + if len(edge_attr) == 3: if not self.properties.directed: raise ValueError( "User-provided edge ids and/or edge " @@ -286,6 +289,7 @@ def __from_edgelist( self.properties.renumber = renumber self.source_columns = source self.destination_columns = destination + self.weight_column = weight # If renumbering is not enabled, this function will only create # the edgelist_df and not do any renumbering. @@ -356,8 +360,8 @@ def renumbered(self): def view_edge_list(self): """ - FIXME: For undirected graph, is it trully returning the upper triangular - matrix of the symmetrized edgelist? + FIXME: Should this also return the edge ids and types? + Display the edge list. Compute it if needed. NOTE: If the graph is of type Graph() then the displayed undirected edges are the same as displayed by networkx Graph(), but the direction @@ -408,12 +412,23 @@ def view_edge_list(self): edgelist_df = edgelist_df.groupby(by=[srcCol, dstCol]).sum().reset_index() wgtCol = simpleDistributedGraphImpl.edgeWeightCol if wgtCol in edgelist_df.columns: + # FIXME: This breaks if there are are multi edges as those will + # be dropped during the symmetrization step and the original 'weight' + # will be halved. edgelist_df[wgtCol] /= 2 + if is_string_dtype: # unrenumber the vertices edgelist_df = self.renumber_map.unrenumber(edgelist_df, srcCol) edgelist_df = self.renumber_map.unrenumber(edgelist_df, dstCol) + + edgelist_df = edgelist_df.rename( + columns=self.renumber_map.internal_to_external_col_names) + + # If there is no 'wgt' column, nothing will happen + edgelist_df = edgelist_df.rename(columns={wgtCol: self.weight_column}) + self.properties.edge_count = len(edgelist_df) return edgelist_df @@ -1041,18 +1056,9 @@ def edges(self): sources and destinations. It does not return the edge weights. For viewing edges with weights use view_edge_list() """ - if self.renumbered is True: - src_col_name = self.renumber_map.renumbered_src_col_name - dst_col_name = self.renumber_map.renumbered_dst_col_name - # FIXME: from_dask_cudf_edgelist() currently requires - # renumber=True for MG, so this else block will not be - # used. Should this else block be removed and added back when - # the restriction is removed? - else: - src_col_name = self.source_columns - dst_col_name = self.destination_columns + + return self.view_edge_list()[self.vertex_columns] - return self.view_edge_list()[[src_col_name, dst_col_name]] def nodes(self): """ diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index dd6d416327a..2579a3a35d2 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -109,6 +109,12 @@ def __init__(self, properties): self.batch_adjlists = None self.batch_transposed_adjlists = None + self.source_columns = None + self.detination_columns = None + self.vertex_columns = None + self.weight_column = None + + # Functions # FIXME: Change to public function # FIXME: Make function more modular @@ -149,6 +155,7 @@ def __from_edgelist( "destination parameters" ) df_columns = s_col + d_col + self.vertex_columns = df_columns.copy() if edge_attr is not None: if weight is not None or edge_id is not None or edge_type is not None: @@ -215,6 +222,9 @@ def __from_edgelist( # Original, unmodified input dataframe. self.input_df = elist + self.weight_column = weight + self.source_columns = source + self.destination_columns = source # Renumbering self.renumber_map = None @@ -370,8 +380,6 @@ def to_numpy_matrix(self): def view_edge_list(self): """ - FIXME: For undirected graph, is it trully returning the upper triangular - matrix of the symmetrized edgelist? Display the edge list. Compute it if needed. NOTE: If the graph is of type Graph() then the displayed undirected edges are the same as displayed by networkx Graph(), but the direction @@ -409,6 +417,16 @@ def view_edge_list(self): edgelist_df = self.edgelist.edgelist_df + # FIXME: When renumbered, the MG API uses renumbered col names which + # is not consistant with the SG API. + if not self.properties.directed: + # Extract the upper triangular matrix from the renumebred edges + edgelist_df = edgelist_df[ + edgelist_df[simpleGraphImpl.srcCol] + <= edgelist_df[simpleGraphImpl.dstCol] + ] + edgelist_df = edgelist_df.reset_index(drop=True) + # FIXME: No need to un-renumber as it is expensive. if self.properties.renumbered: edgelist_df = self.renumber_map.unrenumber( @@ -417,17 +435,16 @@ def view_edge_list(self): edgelist_df = self.renumber_map.unrenumber( edgelist_df, simpleGraphImpl.dstCol ) - - # FIXME: revisit this approach - if not self.properties.directed: - edgelist_df = edgelist_df[ - edgelist_df[simpleGraphImpl.srcCol] - <= edgelist_df[simpleGraphImpl.dstCol] - ] - edgelist_df = edgelist_df.reset_index(drop=True) self.properties.edge_count = len(edgelist_df) + edgelist_df = edgelist_df.rename( + columns=self.renumber_map.internal_to_external_col_names) + + # If there is no 'wgt' column, nothing will happen + wgtCol = simpleGraphImpl.edgeWeightCol + edgelist_df = edgelist_df.rename(columns={wgtCol: self.weight_column}) + return edgelist_df def delete_edge_list(self): @@ -1179,7 +1196,8 @@ def edges(self): sources and destinations. It does not return the edge weights. For viewing edges with weights use view_edge_list() """ - return self.view_edge_list()[[simpleGraphImpl.srcCol, simpleGraphImpl.dstCol]] + # FIXME: The user needs to get the edges with original column names + return self.view_edge_list()[self.vertex_columns] def nodes(self): """ diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index 85a2d50e816..7e90069bfe8 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -258,6 +258,7 @@ def __init__( # The column name 'id' contains the renumbered vertices and the other column(s) # contain the original vertices self.df_internal_to_external = None + self.internal_to_external_col_names = {} @staticmethod def compute_vals_types(df, column_names): @@ -481,7 +482,7 @@ def renumber_and_segment( # For columns with mismatch dtypes, set the renumbered # id_type to either 'int32' or 'int64' if isinstance(src_col_names, list): - vertex_col_names = src_col_names + vertex_col_names = src_col_names.copy() else: vertex_col_names = [src_col_names] if isinstance(dst_col_names, list): @@ -511,7 +512,16 @@ def renumber_and_segment( renumbered = True renumber_map = NumberMap(renumber_id_type, unrenumbered_id_type, renumbered) - if not isinstance(src_col_names, list): + renumber_map.input_src_col_names = src_col_names + renumber_map.input_dst_col_names = dst_col_names + if not isinstance(renumber_map.input_src_col_names, list): + """ + FIXME: Add mapping for multicolumn vertices. + renumber_map.internal_to_external_col_names = { + renumber_map.renumbered_src_col_name: renumber_map.input_src_col_names, + renumber_map.renumbered_dst_col_name: renumber_map.input_dst_col_names + } + """ src_col_names = [src_col_names] dst_col_names = [dst_col_names] @@ -655,6 +665,14 @@ def unrenumber(self, df, column_name, preserve_order=False, get_column_names=Fal for nm in self.implementation.col_names: mapping[nm] = nm + "_" + column_name col_names = list(mapping.values()) + # FIXME: instead of hardcoded value, it should be 'simpleGraphImpl.srcCol' + # but there is no way to retrieve it with the current API + if column_name in [self.renumbered_src_col_name, "src"]: + self.internal_to_external_col_names.update(dict(zip( + col_names, self.input_src_col_names))) + elif column_name in [self.renumbered_dst_col_name, "dst"]: + self.internal_to_external_col_names.update(dict(zip( + col_names, self.input_dst_col_names))) if preserve_order: index_name = NumberMap.generate_unused_column_name(df) @@ -673,6 +691,7 @@ def unrenumber(self, df, column_name, preserve_order=False, get_column_names=Fal df = df.map_partitions(lambda df: df.rename(columns=mapping, copy=False)) else: df = df.rename(columns=mapping, copy=False) + # FIXME: This parameter is not working as expected if get_column_names: return df, col_names else: From bd8c978f3eb090044a457c83238421312c360d4e Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 28 Jul 2023 10:00:36 -0700 Subject: [PATCH 03/35] add fixme --- python/cugraph/cugraph/structure/symmetrize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cugraph/cugraph/structure/symmetrize.py b/python/cugraph/cugraph/structure/symmetrize.py index 15011fa8dbc..4c00e68344d 100644 --- a/python/cugraph/cugraph/structure/symmetrize.py +++ b/python/cugraph/cugraph/structure/symmetrize.py @@ -230,6 +230,7 @@ def symmetrize( """ + # FIXME: Redundant check that should be done at the graph creation if "edge_id" in input_df.columns and symmetrize: raise ValueError("Edge IDs are not supported on undirected graphs") From 2f91b5f05336c2572ee18d507068166483142aa7 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 28 Jul 2023 10:00:58 -0700 Subject: [PATCH 04/35] remove expensive test --- .../tests/centrality/test_edge_betweenness_centrality_mg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py index aa41f8e1c82..f10e2029380 100644 --- a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py @@ -41,7 +41,7 @@ def setup_function(): # email_Eu_core is too expensive to test -datasets = DATASETS_UNDIRECTED + [email_Eu_core] +datasets = DATASETS_UNDIRECTED # ============================================================================= From 82dd74e6c4bdd2dad804e49f1459d842aa565790 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 28 Jul 2023 10:42:19 -0700 Subject: [PATCH 05/35] fix style --- .../simpleDistributedGraph.py | 22 +++++++++---------- .../graph_implementation/simpleGraph.py | 8 +++---- .../cugraph/cugraph/structure/number_map.py | 10 +++++---- .../test_edge_betweenness_centrality_mg.py | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index 3b6e0b9d38c..d9d6f0e399c 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -177,7 +177,7 @@ def __from_edgelist( "and destination parameters" ) ddf_columns = s_col + d_col - self.vertex_columns = ddf_columns.copy() + self.vertex_columns = ddf_columns.copy() _client = default_client() workers = _client.scheduler_info()["workers"] # Repartition to 2 partitions per GPU for memory efficient process @@ -392,7 +392,7 @@ def view_edge_list(self): """ if self.edgelist is None: raise RuntimeError("Graph has no Edgelist.") - + edgelist_df = self.input_df is_string_dtype = False if not self.properties.directed: @@ -406,8 +406,9 @@ def view_edge_list(self): srcCol = self.renumber_map.renumbered_src_col_name dstCol = self.renumber_map.renumbered_dst_col_name - edgelist_df[srcCol], edgelist_df[dstCol] = \ - edgelist_df[[srcCol, dstCol]].min(axis=1), edgelist_df[[srcCol, dstCol]].max(axis=1) + edgelist_df[srcCol], edgelist_df[dstCol] = edgelist_df[ + [srcCol, dstCol] + ].min(axis=1), edgelist_df[[srcCol, dstCol]].max(axis=1) edgelist_df = edgelist_df.groupby(by=[srcCol, dstCol]).sum().reset_index() wgtCol = simpleDistributedGraphImpl.edgeWeightCol @@ -417,15 +418,15 @@ def view_edge_list(self): # will be halved. edgelist_df[wgtCol] /= 2 - if is_string_dtype: # unrenumber the vertices edgelist_df = self.renumber_map.unrenumber(edgelist_df, srcCol) edgelist_df = self.renumber_map.unrenumber(edgelist_df, dstCol) edgelist_df = edgelist_df.rename( - columns=self.renumber_map.internal_to_external_col_names) - + columns=self.renumber_map.internal_to_external_col_names + ) + # If there is no 'wgt' column, nothing will happen edgelist_df = edgelist_df.rename(columns={wgtCol: self.weight_column}) @@ -465,8 +466,7 @@ def number_of_edges(self, directed_edges=False): if directed_edges and self.edgelist is not None: return len(self.edgelist.edgelist_df) - # FIXME: set 'self.properties.edge_count' when viewing the edgelist - # Update the view of edgelist for undirected graph in MG here (return only lower) + if self.properties.edge_count is None: if self.edgelist is not None: self.view_edge_list() @@ -1056,9 +1056,8 @@ def edges(self): sources and destinations. It does not return the edge weights. For viewing edges with weights use view_edge_list() """ - - return self.view_edge_list()[self.vertex_columns] + return self.view_edge_list()[self.vertex_columns] def nodes(self): """ @@ -1091,7 +1090,6 @@ def nodes(self): else: raise RuntimeError("Graph is Empty") - def neighbors(self, n): if self.edgelist is None: raise RuntimeError("Graph has no Edgelist.") diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 2579a3a35d2..45025b87144 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -113,7 +113,6 @@ def __init__(self, properties): self.detination_columns = None self.vertex_columns = None self.weight_column = None - # Functions # FIXME: Change to public function @@ -435,12 +434,13 @@ def view_edge_list(self): edgelist_df = self.renumber_map.unrenumber( edgelist_df, simpleGraphImpl.dstCol ) - + self.properties.edge_count = len(edgelist_df) edgelist_df = edgelist_df.rename( - columns=self.renumber_map.internal_to_external_col_names) - + columns=self.renumber_map.internal_to_external_col_names + ) + # If there is no 'wgt' column, nothing will happen wgtCol = simpleGraphImpl.edgeWeightCol edgelist_df = edgelist_df.rename(columns={wgtCol: self.weight_column}) diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index 7e90069bfe8..91b13c60490 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -668,11 +668,13 @@ def unrenumber(self, df, column_name, preserve_order=False, get_column_names=Fal # FIXME: instead of hardcoded value, it should be 'simpleGraphImpl.srcCol' # but there is no way to retrieve it with the current API if column_name in [self.renumbered_src_col_name, "src"]: - self.internal_to_external_col_names.update(dict(zip( - col_names, self.input_src_col_names))) + self.internal_to_external_col_names.update( + dict(zip(col_names, self.input_src_col_names)) + ) elif column_name in [self.renumbered_dst_col_name, "dst"]: - self.internal_to_external_col_names.update(dict(zip( - col_names, self.input_dst_col_names))) + self.internal_to_external_col_names.update( + dict(zip(col_names, self.input_dst_col_names)) + ) if preserve_order: index_name = NumberMap.generate_unused_column_name(df) diff --git a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py index f10e2029380..97e503e5428 100644 --- a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py @@ -16,7 +16,7 @@ import dask_cudf from pylibcugraph.testing.utils import gen_fixture_params_product -from cugraph.experimental.datasets import DATASETS_UNDIRECTED, email_Eu_core +from cugraph.experimental.datasets import DATASETS_UNDIRECTED import cugraph import cugraph.dask as dcg From 4f03d88be266389f5ea416cf2d1245ab3f7c5a9f Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 31 Jul 2023 13:48:11 -0700 Subject: [PATCH 06/35] retrieve weights if they were passed --- python/pylibcugraph/pylibcugraph/graphs.pxd | 1 + python/pylibcugraph/pylibcugraph/graphs.pyx | 17 ++++++++--------- .../pylibcugraph/uniform_random_walks.pyx | 11 ++++++++--- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/graphs.pxd b/python/pylibcugraph/pylibcugraph/graphs.pxd index 4e52ed557ed..a2df44ba26e 100644 --- a/python/pylibcugraph/pylibcugraph/graphs.pxd +++ b/python/pylibcugraph/pylibcugraph/graphs.pxd @@ -25,6 +25,7 @@ from pylibcugraph._cugraph_c.graph cimport ( cdef class _GPUGraph: cdef cugraph_graph_t* c_graph_ptr cdef cugraph_type_erased_device_array_view_t* edge_id_view_ptr + cdef cugraph_type_erased_device_array_view_t* weights_view_ptr cdef class SGGraph(_GPUGraph): pass diff --git a/python/pylibcugraph/pylibcugraph/graphs.pyx b/python/pylibcugraph/pylibcugraph/graphs.pyx index fb4692bf3a8..33a8a09c6f4 100644 --- a/python/pylibcugraph/pylibcugraph/graphs.pyx +++ b/python/pylibcugraph/pylibcugraph/graphs.pyx @@ -166,11 +166,10 @@ cdef class SGGraph(_GPUGraph): dst_or_index_array ) - cdef cugraph_type_erased_device_array_view_t* weights_view_ptr = \ - create_cugraph_type_erased_device_array_view_from_py_obj( + + self.weights_view_ptr = create_cugraph_type_erased_device_array_view_from_py_obj( weight_array ) - self.edge_id_view_ptr = create_cugraph_type_erased_device_array_view_from_py_obj( edge_id_array @@ -187,7 +186,7 @@ cdef class SGGraph(_GPUGraph): &(graph_properties.c_graph_properties), srcs_or_offsets_view_ptr, dsts_or_indices_view_ptr, - weights_view_ptr, + self.weights_view_ptr, self.edge_id_view_ptr, edge_type_view_ptr, store_transposed, @@ -205,7 +204,7 @@ cdef class SGGraph(_GPUGraph): &(graph_properties.c_graph_properties), srcs_or_offsets_view_ptr, dsts_or_indices_view_ptr, - weights_view_ptr, + self.weights_view_ptr, self.edge_id_view_ptr, edge_type_view_ptr, store_transposed, @@ -224,7 +223,7 @@ cdef class SGGraph(_GPUGraph): cugraph_type_erased_device_array_view_free(srcs_or_offsets_view_ptr) cugraph_type_erased_device_array_view_free(dsts_or_indices_view_ptr) - cugraph_type_erased_device_array_view_free(weights_view_ptr) + cugraph_type_erased_device_array_view_free(self.weights_view_ptr) if self.edge_id_view_ptr is not NULL: cugraph_type_erased_device_array_view_free(self.edge_id_view_ptr) if edge_type_view_ptr is not NULL: @@ -337,7 +336,7 @@ cdef class MGGraph(_GPUGraph): create_cugraph_type_erased_device_array_view_from_py_obj( dst_array ) - cdef cugraph_type_erased_device_array_view_t* weights_view_ptr = \ + self.weights_view_ptr = \ create_cugraph_type_erased_device_array_view_from_py_obj( weight_array ) @@ -355,7 +354,7 @@ cdef class MGGraph(_GPUGraph): &(graph_properties.c_graph_properties), srcs_view_ptr, dsts_view_ptr, - weights_view_ptr, + self.weights_view_ptr, self.edge_id_view_ptr, edge_type_view_ptr, store_transposed, @@ -369,7 +368,7 @@ cdef class MGGraph(_GPUGraph): cugraph_type_erased_device_array_view_free(srcs_view_ptr) cugraph_type_erased_device_array_view_free(dsts_view_ptr) - cugraph_type_erased_device_array_view_free(weights_view_ptr) + cugraph_type_erased_device_array_view_free(self.weights_view_ptr) if self.edge_id_view_ptr is not NULL: cugraph_type_erased_device_array_view_free(self.edge_id_view_ptr) if edge_type_view_ptr is not NULL: diff --git a/python/pylibcugraph/pylibcugraph/uniform_random_walks.pyx b/python/pylibcugraph/pylibcugraph/uniform_random_walks.pyx index 4a2b8a70189..1570523beb8 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_random_walks.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_random_walks.pyx @@ -96,6 +96,8 @@ def uniform_random_walks(ResourceHandle resource_handle, cdef uintptr_t cai_start_ptr = \ start_vertices.__cuda_array_interface__["data"][0] + cdef cugraph_type_erased_device_array_view_t* weights_ptr + cdef cugraph_type_erased_device_array_view_t* start_ptr = \ cugraph_type_erased_device_array_view_create( cai_start_ptr, @@ -113,14 +115,17 @@ def uniform_random_walks(ResourceHandle resource_handle, cdef cugraph_type_erased_device_array_view_t* path_ptr = \ cugraph_random_walk_result_get_paths(result_ptr) - cdef cugraph_type_erased_device_array_view_t* weights_ptr = \ - cugraph_random_walk_result_get_weights(result_ptr) + + if input_graph.weights_view_ptr is NULL: + cupy_weights = None + else: + weights_ptr = cugraph_random_walk_result_get_weights(result_ptr) + cupy_weights = copy_to_cupy_array(c_resource_handle_ptr, weights_ptr) max_path_length = \ cugraph_random_walk_result_get_max_path_length(result_ptr) cupy_paths = copy_to_cupy_array(c_resource_handle_ptr, path_ptr) - cupy_weights = copy_to_cupy_array(c_resource_handle_ptr, weights_ptr) cugraph_random_walk_result_free(result_ptr) cugraph_type_erased_device_array_view_free(start_ptr) From 8a9453bc6d09b805315a3569f95f3521a6e3f3a0 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 31 Jul 2023 14:18:37 -0700 Subject: [PATCH 07/35] update tests --- .../centrality/test_betweenness_centrality.py | 2 +- .../tests/community/test_balanced_cut.py | 4 +- .../tests/community/test_k_truss_subgraph.py | 8 ++-- .../tests/link_analysis/test_pagerank.py | 4 +- .../cugraph/tests/nx/test_nx_convert.py | 2 +- .../tests/sampling/test_random_walks.py | 41 +++++++++++-------- 6 files changed, 34 insertions(+), 27 deletions(-) diff --git a/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality.py b/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality.py index 3e4dd3af4fc..db34c68a054 100644 --- a/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality.py +++ b/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality.py @@ -118,7 +118,7 @@ def calc_betweenness_centrality( ) M = G.to_pandas_edgelist().rename( - columns={"src": "0", "dst": "1", "weights": "weight"} + columns={"src": "0", "dst": "1", "wgt": edge_attr} ) Gnx = nx.from_pandas_edgelist( diff --git a/python/cugraph/cugraph/tests/community/test_balanced_cut.py b/python/cugraph/cugraph/tests/community/test_balanced_cut.py index 0a95a1846ce..f6c1a741011 100644 --- a/python/cugraph/cugraph/tests/community/test_balanced_cut.py +++ b/python/cugraph/cugraph/tests/community/test_balanced_cut.py @@ -102,7 +102,7 @@ def test_edge_cut_clustering_with_edgevals(graph_file, partitions): @pytest.mark.sg -@pytest.mark.parametrize("graph_file", [DEFAULT_DATASETS[2]]) +@pytest.mark.parametrize("graph_file", DEFAULT_DATASETS) @pytest.mark.parametrize("partitions", PARTITIONS) def test_edge_cut_clustering_with_edgevals_nx(graph_file, partitions): gc.collect() @@ -111,7 +111,7 @@ def test_edge_cut_clustering_with_edgevals_nx(graph_file, partitions): # read_weights_in_sp=True => value column dtype is float32 G = graph_file.get_graph() NM = G.to_pandas_edgelist().rename( - columns={"src": "0", "dst": "1", "weights": "weight"} + columns={"src": "0", "dst": "1", "wgt": "weight"} ) G = nx.from_pandas_edgelist( diff --git a/python/cugraph/cugraph/tests/community/test_k_truss_subgraph.py b/python/cugraph/cugraph/tests/community/test_k_truss_subgraph.py index b0dcc2ede3d..c1f8f4c3546 100644 --- a/python/cugraph/cugraph/tests/community/test_k_truss_subgraph.py +++ b/python/cugraph/cugraph/tests/community/test_k_truss_subgraph.py @@ -39,7 +39,7 @@ def setup_function(): # currently in networkx master and will hopefully will make it to a release # soon. def ktruss_ground_truth(graph_file): - G = nx.read_edgelist(str(graph_file), nodetype=int, data=(("weights", float),)) + G = nx.read_edgelist(str(graph_file), nodetype=int, data=(("weight", float),)) df = nx.to_pandas_edgelist(G) return df @@ -50,18 +50,18 @@ def compare_k_truss(k_truss_cugraph, k, ground_truth_file): edgelist_df = k_truss_cugraph.view_edge_list() src = edgelist_df["src"] dst = edgelist_df["dst"] - wgt = edgelist_df["weights"] + wgt = edgelist_df["weight"] assert len(edgelist_df) == len(k_truss_nx) for i in range(len(src)): has_edge = ( (k_truss_nx["source"] == src[i]) & (k_truss_nx["target"] == dst[i]) - & np.isclose(k_truss_nx["weights"], wgt[i]) + & np.isclose(k_truss_nx["weight"], wgt[i]) ).any() has_opp_edge = ( (k_truss_nx["source"] == dst[i]) & (k_truss_nx["target"] == src[i]) - & np.isclose(k_truss_nx["weights"], wgt[i]) + & np.isclose(k_truss_nx["weight"], wgt[i]) ).any() assert has_edge or has_opp_edge return True diff --git a/python/cugraph/cugraph/tests/link_analysis/test_pagerank.py b/python/cugraph/cugraph/tests/link_analysis/test_pagerank.py index 8e8ab13574d..9d9572b88b2 100644 --- a/python/cugraph/cugraph/tests/link_analysis/test_pagerank.py +++ b/python/cugraph/cugraph/tests/link_analysis/test_pagerank.py @@ -187,11 +187,11 @@ def test_pagerank( G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) if has_precomputed_vertex_out_weight == 1: - df = G.view_edge_list()[["src", "weights"]] + df = G.view_edge_list()[["src", "wgt"]] pre_vtx_o_wgt = ( df.groupby(["src"], as_index=False) .sum() - .rename(columns={"src": "vertex", "weights": "sums"}) + .rename(columns={"src": "vertex", "wgt": "sums"}) ) cugraph_pr = cugraph_call( diff --git a/python/cugraph/cugraph/tests/nx/test_nx_convert.py b/python/cugraph/cugraph/tests/nx/test_nx_convert.py index 58b89a4bda9..6455c611619 100644 --- a/python/cugraph/cugraph/tests/nx/test_nx_convert.py +++ b/python/cugraph/cugraph/tests/nx/test_nx_convert.py @@ -26,7 +26,7 @@ def _compare_graphs(nxG, cuG, has_wt=True): cu_df = cuG.view_edge_list().to_pandas() if has_wt is True: - cu_df = cu_df.drop(columns=["weights"]) + cu_df = cu_df.drop(columns=["weight"]) out_of_order = cu_df[cu_df["src"] > cu_df["dst"]] if len(out_of_order) > 0: diff --git a/python/cugraph/cugraph/tests/sampling/test_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_random_walks.py index 48629fa03a6..e67e1752ef9 100644 --- a/python/cugraph/cugraph/tests/sampling/test_random_walks.py +++ b/python/cugraph/cugraph/tests/sampling/test_random_walks.py @@ -76,7 +76,7 @@ def calc_random_walks(G, max_depth=None, use_padding=False, legacy_result_type=T """ assert G is not None - G, _ = ensure_cugraph_obj_for_nx(G, nx_weight_attr="weights") + G, _ = ensure_cugraph_obj_for_nx(G, nx_weight_attr="wgt") k = random.randint(1, 6) @@ -136,8 +136,9 @@ def check_random_walks_padded(G, path_data, seeds, max_depth, legacy_result_type e_wgt_paths = path_data[1] e_wgt_idx = 0 - G, _ = ensure_cugraph_obj_for_nx(G, nx_weight_attr="weights") + G, _ = ensure_cugraph_obj_for_nx(G, nx_weight_attr="wgt") df_G = G.input_df + if "weight" in df_G.columns: df_G = df_G.rename(columns={"weight": "wgt"}) @@ -176,17 +177,18 @@ def check_random_walks_padded(G, path_data, seeds, max_depth, legacy_result_type else: # check valid edge wgt - expected_wgt = edge["wgt"].iloc[0] - result_wgt = e_wgt_paths.iloc[e_wgt_idx] - - if expected_wgt != result_wgt: - print( - "[ERR] Invalid edge wgt: " - "The edge src {} dst {} has wgt {} but got {}".format( - src, dst, expected_wgt, result_wgt + if G.is_weighted(): + expected_wgt = edge["wgt"].iloc[0] + result_wgt = e_wgt_paths.iloc[e_wgt_idx] + + if expected_wgt != result_wgt: + print( + "[ERR] Invalid edge wgt: " + "The edge src {} dst {} has wgt {} but got {}".format( + src, dst, expected_wgt, result_wgt + ) ) - ) - invalid_edge_wgt += 1 + invalid_edge_wgt += 1 e_wgt_idx += 1 if src != -1 and dst == -1: @@ -195,9 +197,10 @@ def check_random_walks_padded(G, path_data, seeds, max_depth, legacy_result_type assert invalid_seeds == 0 assert invalid_edge == 0 - assert invalid_edge_wgt == 0 assert len(v_paths) == (max_depth) * len(seeds) - assert len(e_wgt_paths) == (max_depth - 1) * len(seeds) + if G.is_weighted(): + assert invalid_edge_wgt == 0 + assert len(e_wgt_paths) == (max_depth - 1) * len(seeds) if legacy_result_type: sizes = path_data[2] @@ -298,11 +301,15 @@ def test_random_walks_nx(graph_file): M = G.to_pandas_edgelist() + source = G.source_columns + target = G.destination_columns + edge_attr = G.weight_column + Gnx = nx.from_pandas_edgelist( M, - source="src", - target="dst", - edge_attr="weights", + source=source, + target=target, + edge_attr=edge_attr, create_using=nx.DiGraph(), ) max_depth = random.randint(2, 10) From 9c349b46fdfe79d21d536633c2e887245fa50b90 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 31 Jul 2023 14:19:07 -0700 Subject: [PATCH 08/35] fix typo --- .../cugraph/structure/graph_implementation/simpleGraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 45025b87144..128f84e430b 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -223,7 +223,7 @@ def __from_edgelist( self.input_df = elist self.weight_column = weight self.source_columns = source - self.destination_columns = source + self.destination_columns = destination # Renumbering self.renumber_map = None From fdb98e744cc0a5689650984176282c095fd4fb55 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 31 Jul 2023 14:19:43 -0700 Subject: [PATCH 09/35] remove hardcoded column name --- python/cugraph/cugraph/utilities/nx_factory.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/utilities/nx_factory.py b/python/cugraph/cugraph/utilities/nx_factory.py index 2448a511229..d07d17978d7 100644 --- a/python/cugraph/cugraph/utilities/nx_factory.py +++ b/python/cugraph/cugraph/utilities/nx_factory.py @@ -236,11 +236,15 @@ def cugraph_to_nx(G): pdf = G.view_edge_list().to_pandas() num_col = len(pdf.columns) + source = G.source_columns + target = G.destination_columns + if num_col == 2: - Gnx = nx.from_pandas_edgelist(pdf, source="src", target="dst") + Gnx = nx.from_pandas_edgelist(pdf, source=source, target=target) else: + edge_attr = G.weight_column Gnx = nx.from_pandas_edgelist( - pdf, source="src", target="dst", edge_attr="weights" + pdf, source=source, target=target, edge_attr=edge_attr ) return Gnx From 4a25d00e8fcc365dc82a035094373e674d9ebfce Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 31 Jul 2023 14:21:02 -0700 Subject: [PATCH 10/35] fix style --- python/cugraph/cugraph/tests/sampling/test_random_walks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/tests/sampling/test_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_random_walks.py index e67e1752ef9..9c94e036683 100644 --- a/python/cugraph/cugraph/tests/sampling/test_random_walks.py +++ b/python/cugraph/cugraph/tests/sampling/test_random_walks.py @@ -138,7 +138,7 @@ def check_random_walks_padded(G, path_data, seeds, max_depth, legacy_result_type G, _ = ensure_cugraph_obj_for_nx(G, nx_weight_attr="wgt") df_G = G.input_df - + if "weight" in df_G.columns: df_G = df_G.rename(columns={"weight": "wgt"}) From e335d2b34ee6e1b30a30c5fbc2ef66774d412d1f Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 1 Aug 2023 08:34:53 -0700 Subject: [PATCH 11/35] remove (invalid) hardcoded column names --- .../cugraph/structure/graph_classes.py | 6 ++- .../simpleDistributedGraph.py | 7 +-- .../graph_implementation/simpleGraph.py | 48 ++++++++++++------- .../cugraph/cugraph/structure/number_map.py | 40 +++++++++++----- python/cugraph/cugraph/testing/utils.py | 2 +- .../cugraph/tests/structure/test_graph.py | 25 +++++----- .../tests/structure/test_multigraph.py | 2 +- 7 files changed, 82 insertions(+), 48 deletions(-) diff --git a/python/cugraph/cugraph/structure/graph_classes.py b/python/cugraph/cugraph/structure/graph_classes.py index b89ada9bf50..622352a7d50 100644 --- a/python/cugraph/cugraph/structure/graph_classes.py +++ b/python/cugraph/cugraph/structure/graph_classes.py @@ -68,11 +68,13 @@ def __init__(self, m_graph=None, directed=False): if isinstance(m_graph, MultiGraph): elist = m_graph.view_edge_list() if m_graph.is_weighted(): - weights = "weights" + weights = m_graph.weight_column else: weights = None self.from_cudf_edgelist( - elist, source="src", destination="dst", edge_attr=weights + elist, source=m_graph.source_columns, + destination=m_graph.destination_columns, + edge_attr=weights ) else: raise TypeError( diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index d9d6f0e399c..2d8b652a071 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -423,9 +423,10 @@ def view_edge_list(self): edgelist_df = self.renumber_map.unrenumber(edgelist_df, srcCol) edgelist_df = self.renumber_map.unrenumber(edgelist_df, dstCol) - edgelist_df = edgelist_df.rename( - columns=self.renumber_map.internal_to_external_col_names - ) + if self.properties.renumbered: + edgelist_df = edgelist_df.rename( + columns=self.renumber_map.internal_to_external_col_names + ) # If there is no 'wgt' column, nothing will happen edgelist_df = edgelist_df.rename(columns={wgtCol: self.weight_column}) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 128f84e430b..2769b3a9c67 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -96,6 +96,7 @@ def __init__(self, properties): def __init__(self, properties): # Structure self.edgelist = None + self.input_df = None self.adjlist = None self.transposedadjlist = None self.renumber_map = None @@ -110,7 +111,7 @@ def __init__(self, properties): self.batch_transposed_adjlists = None self.source_columns = None - self.detination_columns = None + self.destination_columns = None self.vertex_columns = None self.weight_column = None @@ -218,13 +219,12 @@ def __from_edgelist( elist = input_df.compute().reset_index(drop=True) else: raise TypeError("input should be a cudf.DataFrame or a dask_cudf dataFrame") - # Original, unmodified input dataframe. self.input_df = elist self.weight_column = weight self.source_columns = source self.destination_columns = destination - + # Renumbering self.renumber_map = None self.store_transposed = store_transposed @@ -415,18 +415,14 @@ def view_edge_list(self): self.edgelist = self.EdgeList(src, dst, weights) edgelist_df = self.edgelist.edgelist_df + srcCol = self.source_columns + dstCol = self.destination_columns - # FIXME: When renumbered, the MG API uses renumbered col names which - # is not consistant with the SG API. - if not self.properties.directed: - # Extract the upper triangular matrix from the renumebred edges - edgelist_df = edgelist_df[ - edgelist_df[simpleGraphImpl.srcCol] - <= edgelist_df[simpleGraphImpl.dstCol] - ] - edgelist_df = edgelist_df.reset_index(drop=True) + if isinstance(srcCol, list) and len(srcCol) == 1: + srcCol = srcCol[0] + dstCol = dstCol[0] - # FIXME: No need to un-renumber as it is expensive. + # FIXME: Need to un-renumber if vertices are non integer or multi column if self.properties.renumbered: edgelist_df = self.renumber_map.unrenumber( edgelist_df, simpleGraphImpl.srcCol @@ -434,13 +430,30 @@ def view_edge_list(self): edgelist_df = self.renumber_map.unrenumber( edgelist_df, simpleGraphImpl.dstCol ) + print("the mapping = ", self.renumber_map.internal_to_external_col_names) + edgelist_df = edgelist_df.rename( + columns=self.renumber_map.internal_to_external_col_names + ) + else: + # When the graph is created from adjacency list, 'self.input_df', + # 'self.source_columns' and 'self.destination_columns' are Nont + if self.input_df is not None: + edgelist_df = self.input_df + if srcCol is None and dstCol is None: + srcCol = simpleGraphImpl.srcCol + dstCol = simpleGraphImpl.dstCol + # FIXME: When renumbered, the MG API uses renumbered col names which + # is not consistant with the SG API. + if not self.properties.directed: + # Extract the upper triangular matrix from the renumebred edges + edgelist_df = edgelist_df[ + edgelist_df[srcCol] + <= edgelist_df[dstCol] + ] + edgelist_df = edgelist_df.reset_index(drop=True) self.properties.edge_count = len(edgelist_df) - edgelist_df = edgelist_df.rename( - columns=self.renumber_map.internal_to_external_col_names - ) - # If there is no 'wgt' column, nothing will happen wgtCol = simpleGraphImpl.edgeWeightCol edgelist_df = edgelist_df.rename(columns={wgtCol: self.weight_column}) @@ -1196,7 +1209,6 @@ def edges(self): sources and destinations. It does not return the edge weights. For viewing edges with weights use view_edge_list() """ - # FIXME: The user needs to get the edges with original column names return self.view_edge_list()[self.vertex_columns] def nodes(self): diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index 91b13c60490..f98b95c5afa 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -665,16 +665,32 @@ def unrenumber(self, df, column_name, preserve_order=False, get_column_names=Fal for nm in self.implementation.col_names: mapping[nm] = nm + "_" + column_name col_names = list(mapping.values()) - # FIXME: instead of hardcoded value, it should be 'simpleGraphImpl.srcCol' - # but there is no way to retrieve it with the current API - if column_name in [self.renumbered_src_col_name, "src"]: - self.internal_to_external_col_names.update( - dict(zip(col_names, self.input_src_col_names)) - ) - elif column_name in [self.renumbered_dst_col_name, "dst"]: - self.internal_to_external_col_names.update( - dict(zip(col_names, self.input_dst_col_names)) - ) + + if isinstance(self.input_src_col_names, list): + input_src_col_names = self.input_src_col_names.copy() + input_dst_col_names = self.input_dst_col_names.copy() + else: + # Assuming the src and dst columns are of the same length + # if they are lists. + input_src_col_names = [input_src_col_names] + input_dst_col_names = [input_dst_col_names] + if not isinstance(col_names, list): + col_names = [col_names] + + # FIXME: instead of hardcoded value, it should be 'simpleGraphImpl.srcCol' + # but there is no way to retrieve it with the current API + if column_name in [self.renumbered_src_col_name, "src"]: + self.internal_to_external_col_names.update( + dict(zip(col_names, input_src_col_names)) + ) + elif column_name in [self.renumbered_dst_col_name, "dst"]: + self.internal_to_external_col_names.update( + dict(zip(col_names, input_dst_col_names)) + ) + + if len(self.implementation.col_names) == 1: + col_names = col_names[0] + if preserve_order: index_name = NumberMap.generate_unused_column_name(df) @@ -693,7 +709,9 @@ def unrenumber(self, df, column_name, preserve_order=False, get_column_names=Fal df = df.map_partitions(lambda df: df.rename(columns=mapping, copy=False)) else: df = df.rename(columns=mapping, copy=False) - # FIXME: This parameter is not working as expected + # FIXME: This parameter is not working as expected as it oesn't return + # the unrenumbered column names: leverage 'self.internal_to_external_col_names' + # instead. if get_column_names: return df, col_names else: diff --git a/python/cugraph/cugraph/testing/utils.py b/python/cugraph/cugraph/testing/utils.py index 0dae17ed14e..6d58076e6fe 100644 --- a/python/cugraph/cugraph/testing/utils.py +++ b/python/cugraph/cugraph/testing/utils.py @@ -407,7 +407,7 @@ def compare_mst(mst_cugraph, mst_nx): pass # check total weight - cg_sum = edgelist_df["weights"].sum() + cg_sum = edgelist_df[mst_cugraph.weight_column].sum() nx_sum = mst_nx_df["weight"].sum() print(cg_sum) print(nx_sum) diff --git a/python/cugraph/cugraph/tests/structure/test_graph.py b/python/cugraph/cugraph/tests/structure/test_graph.py index a80c47662e2..3b645322353 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph.py +++ b/python/cugraph/cugraph/tests/structure/test_graph.py @@ -62,8 +62,8 @@ def compare_graphs(nx_graph, cu_graph): edgelist_df = cu_graph.view_edge_list().reset_index(drop=True) df = cudf.DataFrame() - df["source"] = edgelist_df["src"] - df["target"] = edgelist_df["dst"] + df["source"] = edgelist_df["source"] + df["target"] = edgelist_df["target"] if len(edgelist_df.columns) > 2: df["weight"] = edgelist_df["weights"] cu_to_nx_graph = nx.from_pandas_edgelist( @@ -290,7 +290,7 @@ def test_add_edge_or_adj_list_after_add_edge_or_adj_list(graph_file): # Test @pytest.mark.sg -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", [utils.DATASETS[0]]) def test_edges_for_Graph(graph_file): cu_M = utils.read_csv_file(graph_file) @@ -319,10 +319,10 @@ def test_edges_for_Graph(graph_file): edges.append([edge[1], edge[0]]) else: edges.append([edge[0], edge[1]]) - nx_edge_list = cudf.DataFrame(list(edges), columns=["src", "dst"]) + nx_edge_list = cudf.DataFrame(list(edges), columns=["0", "1"]) assert_frame_equal( - nx_edge_list.sort_values(by=["src", "dst"]).reset_index(drop=True), - cu_edge_list.sort_values(by=["src", "dst"]).reset_index(drop=True), + nx_edge_list.sort_values(by=["0", "1"]).reset_index(drop=True), + cu_edge_list.sort_values(by=["0", "1"]).reset_index(drop=True), check_dtype=False, ) @@ -344,7 +344,8 @@ def test_view_edge_list_for_Graph(graph_file): G = cugraph.from_cudf_edgelist( cu_M, source="0", destination="1", create_using=cugraph.Graph ) - cu_edge_list = G.view_edge_list().sort_values(["src", "dst"]) + + cu_edge_list = G.view_edge_list().sort_values(["0", "1"]) # Check if number of Edges is same assert len(nx_edges) == len(cu_edge_list) @@ -359,12 +360,12 @@ def test_view_edge_list_for_Graph(graph_file): edges.append([edge[0], edge[1]]) edges = list(edges) edges.sort() - nx_edge_list = cudf.DataFrame(edges, columns=["src", "dst"]) + nx_edge_list = cudf.DataFrame(edges, columns=["0", "1"]) # Compare nx and cugraph edges when viewing edgelist # assert cu_edge_list.equals(nx_edge_list) - assert (cu_edge_list["src"].to_numpy() == nx_edge_list["src"].to_numpy()).all() - assert (cu_edge_list["dst"].to_numpy() == nx_edge_list["dst"].to_numpy()).all() + assert (cu_edge_list["0"].to_numpy() == nx_edge_list["0"].to_numpy()).all() + assert (cu_edge_list["1"].to_numpy() == nx_edge_list["1"].to_numpy()).all() # Test @@ -682,8 +683,8 @@ def test_to_pandas_edgelist(graph_file): G = cugraph.Graph() G.from_cudf_edgelist(cu_M, source="0", destination="1") - assert "s" in G.to_pandas_edgelist("s", "d").columns - assert "s" in G.to_pandas_edgelist(source="s", destination="d").columns + assert "0" in G.to_pandas_edgelist("0", "1").columns + assert "0" in G.to_pandas_edgelist(source="0", destination="1").columns @pytest.mark.sg diff --git a/python/cugraph/cugraph/tests/structure/test_multigraph.py b/python/cugraph/cugraph/tests/structure/test_multigraph.py index af78c238d4e..a9ea617fdb8 100644 --- a/python/cugraph/cugraph/tests/structure/test_multigraph.py +++ b/python/cugraph/cugraph/tests/structure/test_multigraph.py @@ -47,7 +47,7 @@ def test_multigraph(graph_file): assert G.number_of_nodes() == Gnx.number_of_nodes() cuedges = cugraph.to_pandas_edgelist(G) cuedges.rename( - columns={"src": "source", "dst": "target", "weights": "weight"}, inplace=True + columns={"src": "source", "dst": "target", "wgt": "weight"}, inplace=True ) cuedges["weight"] = cuedges["weight"].round(decimals=3) nxedges = nx.to_pandas_edgelist(Gnx).astype( From d03d827c181099e149adf2047acefa94118e09cf Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 1 Aug 2023 08:35:57 -0700 Subject: [PATCH 12/35] fix style --- python/cugraph/cugraph/structure/graph_classes.py | 5 +++-- .../cugraph/structure/graph_implementation/simpleGraph.py | 7 ++----- python/cugraph/cugraph/structure/number_map.py | 1 - 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/python/cugraph/cugraph/structure/graph_classes.py b/python/cugraph/cugraph/structure/graph_classes.py index 622352a7d50..6f6c7e5a26c 100644 --- a/python/cugraph/cugraph/structure/graph_classes.py +++ b/python/cugraph/cugraph/structure/graph_classes.py @@ -72,9 +72,10 @@ def __init__(self, m_graph=None, directed=False): else: weights = None self.from_cudf_edgelist( - elist, source=m_graph.source_columns, + elist, + source=m_graph.source_columns, destination=m_graph.destination_columns, - edge_attr=weights + edge_attr=weights, ) else: raise TypeError( diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 2769b3a9c67..aa9a2bf5f6d 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -224,7 +224,7 @@ def __from_edgelist( self.weight_column = weight self.source_columns = source self.destination_columns = destination - + # Renumbering self.renumber_map = None self.store_transposed = store_transposed @@ -447,10 +447,7 @@ def view_edge_list(self): # is not consistant with the SG API. if not self.properties.directed: # Extract the upper triangular matrix from the renumebred edges - edgelist_df = edgelist_df[ - edgelist_df[srcCol] - <= edgelist_df[dstCol] - ] + edgelist_df = edgelist_df[edgelist_df[srcCol] <= edgelist_df[dstCol]] edgelist_df = edgelist_df.reset_index(drop=True) self.properties.edge_count = len(edgelist_df) diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index f98b95c5afa..f648d28ba95 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -690,7 +690,6 @@ def unrenumber(self, df, column_name, preserve_order=False, get_column_names=Fal if len(self.implementation.col_names) == 1: col_names = col_names[0] - if preserve_order: index_name = NumberMap.generate_unused_column_name(df) From 7f69c2234c3ccd251da070ea72128de640026114 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 2 Aug 2023 05:27:23 -0700 Subject: [PATCH 13/35] remove invalid hardcoded column names, update docstrings, remove debug print --- .../structure/graph_implementation/simpleGraph.py | 7 +++---- python/cugraph/cugraph/structure/number_map.py | 13 ++++--------- python/cugraph/cugraph/tests/nx/test_nx_convert.py | 4 ++-- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index aa9a2bf5f6d..6b929981b52 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -47,8 +47,8 @@ class simpleGraphImpl: class EdgeList: def __init__( self, - source: str, - destination: str, + source: cudf.Series, + destination: cudf.Series, edge_attr: Union[cudf.DataFrame, Dict[str, cudf.DataFrame]] = None, ): self.edgelist_df = cudf.DataFrame() @@ -430,14 +430,13 @@ def view_edge_list(self): edgelist_df = self.renumber_map.unrenumber( edgelist_df, simpleGraphImpl.dstCol ) - print("the mapping = ", self.renumber_map.internal_to_external_col_names) edgelist_df = edgelist_df.rename( columns=self.renumber_map.internal_to_external_col_names ) else: # When the graph is created from adjacency list, 'self.input_df', # 'self.source_columns' and 'self.destination_columns' are Nont - if self.input_df is not None: + if edgelist_df is None: edgelist_df = self.input_df if srcCol is None and dstCol is None: srcCol = simpleGraphImpl.srcCol diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index f648d28ba95..ed7e07bfce3 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -141,6 +141,8 @@ def __init__( self, ddf, src_col_names, dst_col_names, id_type, store_transposed ): self.col_names = NumberMap.compute_vals(src_col_names) + self.src_col_names = src_col_names + self.dst_col_names = dst_col_names self.val_types = NumberMap.compute_vals_types(ddf, src_col_names) self.val_types["count"] = np.int32 self.id_type = id_type @@ -515,13 +517,6 @@ def renumber_and_segment( renumber_map.input_src_col_names = src_col_names renumber_map.input_dst_col_names = dst_col_names if not isinstance(renumber_map.input_src_col_names, list): - """ - FIXME: Add mapping for multicolumn vertices. - renumber_map.internal_to_external_col_names = { - renumber_map.renumbered_src_col_name: renumber_map.input_src_col_names, - renumber_map.renumbered_dst_col_name: renumber_map.input_dst_col_names - } - """ src_col_names = [src_col_names] dst_col_names = [dst_col_names] @@ -672,8 +667,8 @@ def unrenumber(self, df, column_name, preserve_order=False, get_column_names=Fal else: # Assuming the src and dst columns are of the same length # if they are lists. - input_src_col_names = [input_src_col_names] - input_dst_col_names = [input_dst_col_names] + input_src_col_names = [self.input_src_col_names] + input_dst_col_names = [self.input_dst_col_names] if not isinstance(col_names, list): col_names = [col_names] diff --git a/python/cugraph/cugraph/tests/nx/test_nx_convert.py b/python/cugraph/cugraph/tests/nx/test_nx_convert.py index 6455c611619..e20897572d0 100644 --- a/python/cugraph/cugraph/tests/nx/test_nx_convert.py +++ b/python/cugraph/cugraph/tests/nx/test_nx_convert.py @@ -25,6 +25,7 @@ def _compare_graphs(nxG, cuG, has_wt=True): assert nxG.number_of_edges() == cuG.number_of_edges() cu_df = cuG.view_edge_list().to_pandas() + cu_df = cu_df.rename(columns={"0": "src", "1": "dst"}) if has_wt is True: cu_df = cu_df.drop(columns=["weight"]) @@ -72,12 +73,11 @@ def test_networkx_compatibility(graph_file): # create a cuGraph Directed Graph gdf = cudf.from_pandas(M) - gdf = gdf.rename(columns={"weight": "weights"}) cuG = cugraph.from_cudf_edgelist( gdf, source="0", destination="1", - edge_attr="weights", + edge_attr="weight", create_using=cugraph.Graph(directed=True), ) From 1fb2422ee8d74071daebbf8be317ef8f729858bb Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 2 Aug 2023 05:52:26 -0700 Subject: [PATCH 14/35] remove hardcoded column names --- .../structure/graph_implementation/simpleGraph.py | 2 ++ python/cugraph/cugraph/structure/number_map.py | 12 ++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 6b929981b52..d89f9aa99bf 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -242,6 +242,8 @@ def __from_edgelist( # Use renumber_map to figure out if the python renumbering occured self.properties.renumbered = renumber_map.is_renumbered self.renumber_map = renumber_map + self.renumber_map.implementation.src_col_names = simpleGraphImpl.srcCol + self.renumber_map.implementation.src_col_names = simpleGraphImpl.dstCol else: if type(source) is list and type(destination) is list: raise ValueError("set renumber to True for multi column ids") diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index ed7e07bfce3..83aef848c20 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -25,6 +25,8 @@ class NumberMap: class SingleGPU: def __init__(self, df, src_col_names, dst_col_names, id_type, store_transposed): self.col_names = NumberMap.compute_vals(src_col_names) + # FIXME: rename the next two attributes to its singular conterpart as there + # is only one 'src' and 'dst' col name self.src_col_names = src_col_names self.dst_col_names = dst_col_names self.df = df @@ -525,6 +527,10 @@ def renumber_and_segment( # renumbered_dst_col_name) renumber_map.set_renumbered_col_names(src_col_names, dst_col_names, df.columns) + # FIXME: Remove 'src_col_names' and 'dst_col_names' from this initialization as + # those will capture 'simpleGraph.srcCol' and 'simpleGraph.dstCol'. + # In fact the input src and dst col names are already captured in + # 'renumber_map.input_src_col_names' and 'renumber_map.input_dst_col_names'. if isinstance(df, cudf.DataFrame): renumber_map.implementation = NumberMap.SingleGPU( df, @@ -674,11 +680,13 @@ def unrenumber(self, df, column_name, preserve_order=False, get_column_names=Fal # FIXME: instead of hardcoded value, it should be 'simpleGraphImpl.srcCol' # but there is no way to retrieve it with the current API - if column_name in [self.renumbered_src_col_name, "src"]: + if column_name in [self.renumbered_src_col_name, + self.implementation.src_col_names]: self.internal_to_external_col_names.update( dict(zip(col_names, input_src_col_names)) ) - elif column_name in [self.renumbered_dst_col_name, "dst"]: + elif column_name in [self.renumbered_dst_col_name, + self.implementation.dst_col_names]: self.internal_to_external_col_names.update( dict(zip(col_names, input_dst_col_names)) ) From ccd8d17d4c942006194c0038014c6a6b976b7b4f Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 2 Aug 2023 05:53:03 -0700 Subject: [PATCH 15/35] fix style --- python/cugraph/cugraph/structure/number_map.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index 83aef848c20..1abdced6cf5 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -680,13 +680,17 @@ def unrenumber(self, df, column_name, preserve_order=False, get_column_names=Fal # FIXME: instead of hardcoded value, it should be 'simpleGraphImpl.srcCol' # but there is no way to retrieve it with the current API - if column_name in [self.renumbered_src_col_name, - self.implementation.src_col_names]: + if column_name in [ + self.renumbered_src_col_name, + self.implementation.src_col_names, + ]: self.internal_to_external_col_names.update( dict(zip(col_names, input_src_col_names)) ) - elif column_name in [self.renumbered_dst_col_name, - self.implementation.dst_col_names]: + elif column_name in [ + self.renumbered_dst_col_name, + self.implementation.dst_col_names, + ]: self.internal_to_external_col_names.update( dict(zip(col_names, input_dst_col_names)) ) From 1e9e670a6d1c657d24c98276a199421f19eba716 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 2 Aug 2023 05:57:03 -0700 Subject: [PATCH 16/35] remove outdated fixme --- python/cugraph/cugraph/structure/number_map.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index 1abdced6cf5..d7da20f9d84 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -678,8 +678,6 @@ def unrenumber(self, df, column_name, preserve_order=False, get_column_names=Fal if not isinstance(col_names, list): col_names = [col_names] - # FIXME: instead of hardcoded value, it should be 'simpleGraphImpl.srcCol' - # but there is no way to retrieve it with the current API if column_name in [ self.renumbered_src_col_name, self.implementation.src_col_names, From 17d4624511140548b7eaffb62482a3bdead62c43 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 2 Aug 2023 07:24:36 -0700 Subject: [PATCH 17/35] fix typo --- .../cugraph/structure/graph_implementation/simpleGraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index d89f9aa99bf..3be59f9a8fe 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -243,7 +243,7 @@ def __from_edgelist( self.properties.renumbered = renumber_map.is_renumbered self.renumber_map = renumber_map self.renumber_map.implementation.src_col_names = simpleGraphImpl.srcCol - self.renumber_map.implementation.src_col_names = simpleGraphImpl.dstCol + self.renumber_map.implementation.dst_col_names = simpleGraphImpl.dstCol else: if type(source) is list and type(destination) is list: raise ValueError("set renumber to True for multi column ids") From b206f33dd2e1592ba398a90ad5f599958341456c Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 2 Aug 2023 07:54:45 -0700 Subject: [PATCH 18/35] properly rename column names --- .../cugraph/structure/graph_implementation/simpleGraph.py | 7 ++++++- python/cugraph/cugraph/tests/structure/test_graph.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 3be59f9a8fe..ffe96c2dd66 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -437,12 +437,17 @@ def view_edge_list(self): ) else: # When the graph is created from adjacency list, 'self.input_df', - # 'self.source_columns' and 'self.destination_columns' are Nont + # 'self.source_columns' and 'self.destination_columns' are None if edgelist_df is None: edgelist_df = self.input_df if srcCol is None and dstCol is None: srcCol = simpleGraphImpl.srcCol dstCol = simpleGraphImpl.dstCol + elif not set(self.vertex_columns).issubset(set(edgelist_df.columns)): + # Get the original column names passed by the user. + edgelist_df = edgelist_df.rename( + columns={ + simpleGraphImpl.srcCol: srcCol, simpleGraphImpl.dstCol: dstCol}) # FIXME: When renumbered, the MG API uses renumbered col names which # is not consistant with the SG API. diff --git a/python/cugraph/cugraph/tests/structure/test_graph.py b/python/cugraph/cugraph/tests/structure/test_graph.py index 3b645322353..843ace768d1 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph.py +++ b/python/cugraph/cugraph/tests/structure/test_graph.py @@ -290,7 +290,7 @@ def test_add_edge_or_adj_list_after_add_edge_or_adj_list(graph_file): # Test @pytest.mark.sg -@pytest.mark.parametrize("graph_file", [utils.DATASETS[0]]) +@pytest.mark.parametrize("graph_file", utils.DATASETS) def test_edges_for_Graph(graph_file): cu_M = utils.read_csv_file(graph_file) From 97c6dd3790f9b235d25be147de7a94a8607370fc Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 2 Aug 2023 07:55:39 -0700 Subject: [PATCH 19/35] fix style --- .../cugraph/structure/graph_implementation/simpleGraph.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index ffe96c2dd66..51d2a1b2c3c 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -447,7 +447,10 @@ def view_edge_list(self): # Get the original column names passed by the user. edgelist_df = edgelist_df.rename( columns={ - simpleGraphImpl.srcCol: srcCol, simpleGraphImpl.dstCol: dstCol}) + simpleGraphImpl.srcCol: srcCol, + simpleGraphImpl.dstCol: dstCol, + } + ) # FIXME: When renumbered, the MG API uses renumbered col names which # is not consistant with the SG API. From d6a8cca87074edb4407bcc5bfa71168f8fdcc063 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 2 Aug 2023 08:01:11 -0700 Subject: [PATCH 20/35] add comments --- .../cugraph/structure/graph_implementation/simpleGraph.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 51d2a1b2c3c..ebfb048f93f 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -239,9 +239,10 @@ def __from_edgelist( ) source = renumber_map.renumbered_src_col_name destination = renumber_map.renumbered_dst_col_name - # Use renumber_map to figure out if the python renumbering occured + # Use renumber_map to figure out if the python renumbering occured. self.properties.renumbered = renumber_map.is_renumbered self.renumber_map = renumber_map + # Capture the internal column names in NumberMap. self.renumber_map.implementation.src_col_names = simpleGraphImpl.srcCol self.renumber_map.implementation.dst_col_names = simpleGraphImpl.dstCol else: From 86eb9b5982b1e817c9a60b49f18ef846b33ae193 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 2 Aug 2023 08:03:47 -0700 Subject: [PATCH 21/35] fix typo --- python/cugraph/cugraph/tests/structure/test_graph_mg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/tests/structure/test_graph_mg.py b/python/cugraph/cugraph/tests/structure/test_graph_mg.py index 707b195dfa8..cd5ad00694f 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph_mg.py +++ b/python/cugraph/cugraph/tests/structure/test_graph_mg.py @@ -338,7 +338,7 @@ def test_mg_select_random_vertices( assert len(join) == len(sampled_vertices) -@pytest.mark.sg +@pytest.mark.mg @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) @pytest.mark.parametrize( "edge_props", From 140ca33a8224fc87b90b5d087b3c9e7caea48269 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 2 Aug 2023 10:13:08 -0700 Subject: [PATCH 22/35] add test for edge retrieval --- .../cugraph/tests/structure/test_graph.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/python/cugraph/cugraph/tests/structure/test_graph.py b/python/cugraph/cugraph/tests/structure/test_graph.py index 843ace768d1..94ab199264e 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph.py +++ b/python/cugraph/cugraph/tests/structure/test_graph.py @@ -878,3 +878,64 @@ def test_graph_creation_edge_properties(graph_file, edge_props): G = cugraph.Graph(directed=True) G.from_cudf_edgelist(df, source="0", destination="1", **prop_keys) + + +@pytest.mark.sg +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", [True, False]) +@pytest.mark.parametrize("renumber", [True, False]) +def test_graph_creation_edges(graph_file, directed, renumber): + # Verifies that the input dataframe passed the user is the same + # retrieved from the graph when the graph is directed + srcCol = "source" + dstCol = "target" + wgtCol = "weight" + input_df = cudf.read_csv( + graph_file, + delimiter=" ", + names=[srcCol, dstCol, wgtCol], + dtype=["int32", "int32", "float32"], + header=None, + ) + + G = cugraph.Graph(directed=directed) + + if renumber: + # trigger renumbering by passing a list of vertex column + srcCol = [srcCol] + dstCol = [dstCol] + vertexCol = srcCol + dstCol + else: + vertexCol = [srcCol, dstCol] + G.from_cudf_edgelist( + input_df, source=srcCol, destination=dstCol, edge_attr=wgtCol) + + columns = vertexCol.copy() + columns.append(wgtCol) + + edge_list_view = G.view_edge_list().loc[:, columns] + edges = G.edges().loc[:, vertexCol] + + assert_frame_equal( + edge_list_view.drop(columns=wgtCol).sort_values( + by=vertexCol).reset_index(drop=True), + edges.sort_values(by=vertexCol).reset_index(drop=True), + check_dtype=False, + ) + + if directed: + assert_frame_equal( + edge_list_view.sort_values(by=vertexCol).reset_index(drop=True), + input_df.sort_values(by=vertexCol).reset_index(drop=True), + check_dtype=False, + ) + else: + # If the graph is undirected, ensures that only the upper triangular + # matrix of the adjacency matrix is returned + if isinstance(srcCol, list): + srcCol = srcCol[0] + dstCol = dstCol[0] + is_upper_triangular = edge_list_view[srcCol] <= edge_list_view[dstCol] + is_upper_triangular = list(set(is_upper_triangular.values_host)) + assert len(is_upper_triangular) == 1 + assert is_upper_triangular[0] From 4a1e413d431e88f5102500e913a3d3557b4b7530 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 2 Aug 2023 10:16:38 -0700 Subject: [PATCH 23/35] fix style --- .../cugraph/cugraph/tests/structure/test_graph.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cugraph/cugraph/tests/structure/test_graph.py b/python/cugraph/cugraph/tests/structure/test_graph.py index 94ab199264e..0b92d2b6e32 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph.py +++ b/python/cugraph/cugraph/tests/structure/test_graph.py @@ -899,7 +899,7 @@ def test_graph_creation_edges(graph_file, directed, renumber): ) G = cugraph.Graph(directed=directed) - + if renumber: # trigger renumbering by passing a list of vertex column srcCol = [srcCol] @@ -907,9 +907,8 @@ def test_graph_creation_edges(graph_file, directed, renumber): vertexCol = srcCol + dstCol else: vertexCol = [srcCol, dstCol] - G.from_cudf_edgelist( - input_df, source=srcCol, destination=dstCol, edge_attr=wgtCol) - + G.from_cudf_edgelist(input_df, source=srcCol, destination=dstCol, edge_attr=wgtCol) + columns = vertexCol.copy() columns.append(wgtCol) @@ -917,12 +916,13 @@ def test_graph_creation_edges(graph_file, directed, renumber): edges = G.edges().loc[:, vertexCol] assert_frame_equal( - edge_list_view.drop(columns=wgtCol).sort_values( - by=vertexCol).reset_index(drop=True), + edge_list_view.drop(columns=wgtCol) + .sort_values(by=vertexCol) + .reset_index(drop=True), edges.sort_values(by=vertexCol).reset_index(drop=True), check_dtype=False, ) - + if directed: assert_frame_equal( edge_list_view.sort_values(by=vertexCol).reset_index(drop=True), From 6aa880634187205d034b3b5a98c1311735d70e4e Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 3 Aug 2023 06:26:19 -0700 Subject: [PATCH 24/35] fix the edge_list view --- .../simpleDistributedGraph.py | 2 +- .../graph_implementation/simpleGraph.py | 105 ++++++++++++------ 2 files changed, 75 insertions(+), 32 deletions(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index 2d8b652a071..24cfb1ac114 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -395,6 +395,7 @@ def view_edge_list(self): edgelist_df = self.input_df is_string_dtype = False + wgtCol = simpleDistributedGraphImpl.edgeWeightCol if not self.properties.directed: srcCol = self.source_columns dstCol = self.destination_columns @@ -411,7 +412,6 @@ def view_edge_list(self): ].min(axis=1), edgelist_df[[srcCol, dstCol]].max(axis=1) edgelist_df = edgelist_df.groupby(by=[srcCol, dstCol]).sum().reset_index() - wgtCol = simpleDistributedGraphImpl.edgeWeightCol if wgtCol in edgelist_df.columns: # FIXME: This breaks if there are are multi edges as those will # be dropped during the symmetrization step and the original 'weight' diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index ebfb048f93f..00649850186 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -219,7 +219,7 @@ def __from_edgelist( elist = input_df.compute().reset_index(drop=True) else: raise TypeError("input should be a cudf.DataFrame or a dask_cudf dataFrame") - # Original, unmodified input dataframe. + # initial, unmodified input dataframe. self.input_df = elist self.weight_column = weight self.source_columns = source @@ -239,10 +239,9 @@ def __from_edgelist( ) source = renumber_map.renumbered_src_col_name destination = renumber_map.renumbered_dst_col_name - # Use renumber_map to figure out if the python renumbering occured. + # Use renumber_map to figure out if the python renumbering occured self.properties.renumbered = renumber_map.is_renumbered self.renumber_map = renumber_map - # Capture the internal column names in NumberMap. self.renumber_map.implementation.src_col_names = simpleGraphImpl.srcCol self.renumber_map.implementation.dst_col_names = simpleGraphImpl.dstCol else: @@ -417,16 +416,68 @@ def view_edge_list(self): src, dst, weights = graph_primtypes_wrapper.view_edge_list(self) self.edgelist = self.EdgeList(src, dst, weights) - edgelist_df = self.edgelist.edgelist_df srcCol = self.source_columns dstCol = self.destination_columns + """ + Only use the initial input dataframe if the graph is directed with: + 1) single vertex column names with integer vertex type + 2) list of vertex column names of size 1 with integer vertex type + """ + use_initial_input_df = True + + if self.input_df is not None: + if type(srcCol) is list and type(dstCol) is list: + if len(srcCol) == 1: + srcCol = srcCol[0] + dstCol = dstCol[0] + if self.input_df[srcCol].dtype not in [np.int32, np.int64] or self.input_df[ + dstCol].dtype not in [np.int32, np.int64]: + # hypergraph case + use_initial_input_df = False + else: + use_initial_input_df = False + + elif self.input_df[srcCol].dtype not in [np.int32, np.int64] or self.input_df[ + dstCol + ].dtype not in [np.int32, np.int64]: + use_initial_input_df = False + else: + use_initial_input_df = False - if isinstance(srcCol, list) and len(srcCol) == 1: - srcCol = srcCol[0] - dstCol = dstCol[0] + if use_initial_input_df and self.properties.directed: + edgelist_df = self.input_df + else: + edgelist_df = self.edgelist.edgelist_df + if srcCol is None and dstCol is None: + srcCol = simpleGraphImpl.srcCol + dstCol = simpleGraphImpl.dstCol - # FIXME: Need to un-renumber if vertices are non integer or multi column - if self.properties.renumbered: + if use_initial_input_df and not self.properties.directed: + # unrenumber before extracting the upper triangular part + # case when the vertex column name is of size 1 + if self.properties.renumbered: + edgelist_df = self.renumber_map.unrenumber( + edgelist_df, simpleGraphImpl.srcCol + ) + edgelist_df = self.renumber_map.unrenumber( + edgelist_df, simpleGraphImpl.dstCol + ) + edgelist_df = edgelist_df.rename( + columns=self.renumber_map.internal_to_external_col_names + ) + # extract the upper triangular part + edgelist_df = edgelist_df[ + edgelist_df[srcCol] <= edgelist_df[dstCol]] + else: + edgelist_df = edgelist_df[ + edgelist_df[ + simpleGraphImpl.srcCol] <= edgelist_df[simpleGraphImpl.dstCol]] + elif not use_initial_input_df and self.properties.renumbered: + # Do not unrenumber the vertices if the initial input df was used + if not self.properties.directed: + edgelist_df = edgelist_df[ + edgelist_df[ + simpleGraphImpl.srcCol] <= edgelist_df[simpleGraphImpl.dstCol]] edgelist_df = self.renumber_map.unrenumber( edgelist_df, simpleGraphImpl.srcCol ) @@ -436,37 +487,29 @@ def view_edge_list(self): edgelist_df = edgelist_df.rename( columns=self.renumber_map.internal_to_external_col_names ) - else: - # When the graph is created from adjacency list, 'self.input_df', - # 'self.source_columns' and 'self.destination_columns' are None - if edgelist_df is None: - edgelist_df = self.input_df - if srcCol is None and dstCol is None: - srcCol = simpleGraphImpl.srcCol - dstCol = simpleGraphImpl.dstCol - elif not set(self.vertex_columns).issubset(set(edgelist_df.columns)): - # Get the original column names passed by the user. - edgelist_df = edgelist_df.rename( - columns={ - simpleGraphImpl.srcCol: srcCol, - simpleGraphImpl.dstCol: dstCol, - } - ) + + if self.vertex_columns is not None and len(self.vertex_columns) == 2: + # single column vertices internally renamed to 'simpleGraphImpl.srcCol' + # and 'simpleGraphImpl.dstCol'. + if not set(self.vertex_columns).issubset(set(edgelist_df.columns)): + # Get the initial column names passed by the user. + if srcCol is not None and dstCol is not None: + edgelist_df = edgelist_df.rename( + columns={ + simpleGraphImpl.srcCol: srcCol, simpleGraphImpl.dstCol: dstCol}) # FIXME: When renumbered, the MG API uses renumbered col names which # is not consistant with the SG API. - if not self.properties.directed: - # Extract the upper triangular matrix from the renumebred edges - edgelist_df = edgelist_df[edgelist_df[srcCol] <= edgelist_df[dstCol]] - edgelist_df = edgelist_df.reset_index(drop=True) + self.properties.edge_count = len(edgelist_df) - # If there is no 'wgt' column, nothing will happen wgtCol = simpleGraphImpl.edgeWeightCol - edgelist_df = edgelist_df.rename(columns={wgtCol: self.weight_column}) + edgelist_df = edgelist_df.rename( + columns={wgtCol: self.weight_column}).reset_index(drop=True) return edgelist_df + def delete_edge_list(self): """ Delete the edge list. From 351dc45b0555895695a40207f30549b5c24db893 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 3 Aug 2023 06:30:46 -0700 Subject: [PATCH 25/35] remove invalid hardcoded column names and add more tests --- .../tests/community/test_triangle_count_mg.py | 4 +- .../cugraph/tests/core/test_k_core_mg.py | 7 ++- .../cugraph/tests/structure/test_graph.py | 46 +++++++++++++++++++ 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/python/cugraph/cugraph/tests/community/test_triangle_count_mg.py b/python/cugraph/cugraph/tests/community/test_triangle_count_mg.py index 2cf0525d2ad..0f7bb14581f 100644 --- a/python/cugraph/cugraph/tests/community/test_triangle_count_mg.py +++ b/python/cugraph/cugraph/tests/community/test_triangle_count_mg.py @@ -69,8 +69,8 @@ def input_expected_output(dask_client, input_combo): if start_list: # sample k nodes from the cuGraph graph k = random.randint(1, 10) - srcs = G.view_edge_list()["src"] - dsts = G.view_edge_list()["dst"] + srcs = G.view_edge_list()[G.source_columns] + dsts = G.view_edge_list()[G.destination_columns] nodes = cudf.concat([srcs, dsts]).drop_duplicates() start_list = nodes.sample(k) else: diff --git a/python/cugraph/cugraph/tests/core/test_k_core_mg.py b/python/cugraph/cugraph/tests/core/test_k_core_mg.py index c68108ce241..163e0f1adc9 100644 --- a/python/cugraph/cugraph/tests/core/test_k_core_mg.py +++ b/python/cugraph/cugraph/tests/core/test_k_core_mg.py @@ -83,9 +83,12 @@ def input_expected_output(dask_client, input_combo): ) sg_k_core_results = sg_k_core_graph.view_edge_list() # FIXME: The result will come asymetric. Symmetrize the results + srcCol = sg_k_core_graph.source_columns + dstCol = sg_k_core_graph.destination_columns + wgtCol = sg_k_core_graph.weight_column sg_k_core_results = ( - symmetrize_df(sg_k_core_results, "src", "dst", "weights") - .sort_values(["src", "dst"]) + symmetrize_df(sg_k_core_results, srcCol, dstCol, wgtCol) + .sort_values([srcCol, dstCol]) .reset_index(drop=True) ) diff --git a/python/cugraph/cugraph/tests/structure/test_graph.py b/python/cugraph/cugraph/tests/structure/test_graph.py index 0b92d2b6e32..30520ab068a 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph.py +++ b/python/cugraph/cugraph/tests/structure/test_graph.py @@ -939,3 +939,49 @@ def test_graph_creation_edges(graph_file, directed, renumber): is_upper_triangular = list(set(is_upper_triangular.values_host)) assert len(is_upper_triangular) == 1 assert is_upper_triangular[0] + + +@pytest.mark.sg +@pytest.mark.parametrize("graph_file", [utils.DATASETS_SMALL[0]]) +@pytest.mark.parametrize("directed", [False]) +def test_graph_creation_edges_multi_col_vertices(graph_file, directed): + srcCol = ["src_0", "src_1"] + dstCol = ["dst_0", "dst_1"] + wgtCol = "weight" + vertexCol = srcCol + dstCol + columns = vertexCol.copy() + columns.append(wgtCol) + + input_df = cudf.read_csv( + graph_file, + delimiter=" ", + names=[srcCol[0], dstCol[0], wgtCol], + dtype=["int32", "int32", "float32"], + header=None, + ) + input_df["src_1"] = input_df["src_0"] + 1000 + input_df["dst_1"] = input_df["dst_0"] + 1000 + + G = cugraph.Graph(directed=directed) + G.from_cudf_edgelist(input_df, source=srcCol, destination=dstCol, edge_attr=wgtCol) + + input_df = input_df.loc[:, columns] + edge_list_view = G.view_edge_list().loc[:, columns] + print("\nedgelist view = \n", edge_list_view.head()) + print("\nedges = \n", G.edges().head()) + edges = G.edges().loc[:, vertexCol] + + assert_frame_equal( + edge_list_view.drop(columns=wgtCol) + .sort_values(by=vertexCol) + .reset_index(drop=True), + edges.sort_values(by=vertexCol).reset_index(drop=True), + check_dtype=False, + ) + + if directed: + assert_frame_equal( + edge_list_view.sort_values(by=vertexCol).reset_index(drop=True), + input_df.sort_values(by=vertexCol).reset_index(drop=True), + check_dtype=False, + ) From 35fa60debb72a55b64336299496f5db9adb7bb5a Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 3 Aug 2023 06:31:42 -0700 Subject: [PATCH 26/35] fix style --- .../graph_implementation/simpleGraph.py | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 00649850186..2690ab88c13 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -430,16 +430,19 @@ def view_edge_list(self): if len(srcCol) == 1: srcCol = srcCol[0] dstCol = dstCol[0] - if self.input_df[srcCol].dtype not in [np.int32, np.int64] or self.input_df[ - dstCol].dtype not in [np.int32, np.int64]: + if self.input_df[srcCol].dtype not in [ + np.int32, + np.int64, + ] or self.input_df[dstCol].dtype not in [np.int32, np.int64]: # hypergraph case use_initial_input_df = False else: use_initial_input_df = False - - elif self.input_df[srcCol].dtype not in [np.int32, np.int64] or self.input_df[ - dstCol - ].dtype not in [np.int32, np.int64]: + + elif self.input_df[srcCol].dtype not in [ + np.int32, + np.int64, + ] or self.input_df[dstCol].dtype not in [np.int32, np.int64]: use_initial_input_df = False else: use_initial_input_df = False @@ -466,18 +469,19 @@ def view_edge_list(self): columns=self.renumber_map.internal_to_external_col_names ) # extract the upper triangular part - edgelist_df = edgelist_df[ - edgelist_df[srcCol] <= edgelist_df[dstCol]] + edgelist_df = edgelist_df[edgelist_df[srcCol] <= edgelist_df[dstCol]] else: edgelist_df = edgelist_df[ - edgelist_df[ - simpleGraphImpl.srcCol] <= edgelist_df[simpleGraphImpl.dstCol]] + edgelist_df[simpleGraphImpl.srcCol] + <= edgelist_df[simpleGraphImpl.dstCol] + ] elif not use_initial_input_df and self.properties.renumbered: # Do not unrenumber the vertices if the initial input df was used if not self.properties.directed: edgelist_df = edgelist_df[ - edgelist_df[ - simpleGraphImpl.srcCol] <= edgelist_df[simpleGraphImpl.dstCol]] + edgelist_df[simpleGraphImpl.srcCol] + <= edgelist_df[simpleGraphImpl.dstCol] + ] edgelist_df = self.renumber_map.unrenumber( edgelist_df, simpleGraphImpl.srcCol ) @@ -496,7 +500,10 @@ def view_edge_list(self): if srcCol is not None and dstCol is not None: edgelist_df = edgelist_df.rename( columns={ - simpleGraphImpl.srcCol: srcCol, simpleGraphImpl.dstCol: dstCol}) + simpleGraphImpl.srcCol: srcCol, + simpleGraphImpl.dstCol: dstCol, + } + ) # FIXME: When renumbered, the MG API uses renumbered col names which # is not consistant with the SG API. @@ -505,11 +512,11 @@ def view_edge_list(self): wgtCol = simpleGraphImpl.edgeWeightCol edgelist_df = edgelist_df.rename( - columns={wgtCol: self.weight_column}).reset_index(drop=True) + columns={wgtCol: self.weight_column} + ).reset_index(drop=True) return edgelist_df - def delete_edge_list(self): """ Delete the edge list. From 56a92c9856d33577a1216c72b7f7616b20e6b33b Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 3 Aug 2023 07:48:06 -0700 Subject: [PATCH 27/35] remove hardcoded column names --- python/cugraph/cugraph/tests/core/test_k_core_mg.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/tests/core/test_k_core_mg.py b/python/cugraph/cugraph/tests/core/test_k_core_mg.py index 163e0f1adc9..926e4f2b4d8 100644 --- a/python/cugraph/cugraph/tests/core/test_k_core_mg.py +++ b/python/cugraph/cugraph/tests/core/test_k_core_mg.py @@ -147,7 +147,9 @@ def test_dask_k_core(dask_client, benchmark, input_expected_output): expected_k_core_results = input_expected_output["sg_k_core_results"] k_core_results = ( - k_core_results.compute().sort_values(["src", "dst"]).reset_index(drop=True) + k_core_results.compute().sort_values( + ["src", "dst"]).reset_index(drop=True).rename( + columns={"weights": "weight"}) ) assert_frame_equal( From 711a1fe7483d2b1b1cdf416fa11122b391518ad8 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 3 Aug 2023 07:53:15 -0700 Subject: [PATCH 28/35] fix style --- python/cugraph/cugraph/tests/core/test_k_core_mg.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cugraph/cugraph/tests/core/test_k_core_mg.py b/python/cugraph/cugraph/tests/core/test_k_core_mg.py index 926e4f2b4d8..7f4eeeb69d5 100644 --- a/python/cugraph/cugraph/tests/core/test_k_core_mg.py +++ b/python/cugraph/cugraph/tests/core/test_k_core_mg.py @@ -147,9 +147,10 @@ def test_dask_k_core(dask_client, benchmark, input_expected_output): expected_k_core_results = input_expected_output["sg_k_core_results"] k_core_results = ( - k_core_results.compute().sort_values( - ["src", "dst"]).reset_index(drop=True).rename( - columns={"weights": "weight"}) + k_core_results.compute() + .sort_values(["src", "dst"]) + .reset_index(drop=True) + .rename(columns={"weights": "weight"}) ) assert_frame_equal( From fad38139a0a66112b9400b6b2dae8a06b6a2ba57 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 3 Aug 2023 12:12:18 -0700 Subject: [PATCH 29/35] unrenumber before extracting the upper triangular part --- .../structure/graph_implementation/simpleDistributedGraph.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index 24cfb1ac114..eb5b3cf2d13 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -407,6 +407,11 @@ def view_edge_list(self): srcCol = self.renumber_map.renumbered_src_col_name dstCol = self.renumber_map.renumbered_dst_col_name + if isinstance(srcCol, list) and len(srcCol) == 1: + srcCol = self.renumber_map.renumbered_src_col_name + dstCol = self.renumber_map.renumbered_dst_col_name + edgelist_df = self.edgelist.edgelist_df + edgelist_df[srcCol], edgelist_df[dstCol] = edgelist_df[ [srcCol, dstCol] ].min(axis=1), edgelist_df[[srcCol, dstCol]].max(axis=1) From b1736912e5cc669fcb3c9a98669a2e3fe3493dca Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 3 Aug 2023 12:14:05 -0700 Subject: [PATCH 30/35] add test for mg graph properties --- .../cugraph/tests/structure/test_graph.py | 7 +-- .../cugraph/tests/structure/test_graph_mg.py | 44 +++++++++++++++++++ 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/python/cugraph/cugraph/tests/structure/test_graph.py b/python/cugraph/cugraph/tests/structure/test_graph.py index 30520ab068a..de306309ca4 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph.py +++ b/python/cugraph/cugraph/tests/structure/test_graph.py @@ -942,8 +942,8 @@ def test_graph_creation_edges(graph_file, directed, renumber): @pytest.mark.sg -@pytest.mark.parametrize("graph_file", [utils.DATASETS_SMALL[0]]) -@pytest.mark.parametrize("directed", [False]) +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", [True, False]) def test_graph_creation_edges_multi_col_vertices(graph_file, directed): srcCol = ["src_0", "src_1"] dstCol = ["dst_0", "dst_1"] @@ -967,8 +967,6 @@ def test_graph_creation_edges_multi_col_vertices(graph_file, directed): input_df = input_df.loc[:, columns] edge_list_view = G.view_edge_list().loc[:, columns] - print("\nedgelist view = \n", edge_list_view.head()) - print("\nedges = \n", G.edges().head()) edges = G.edges().loc[:, vertexCol] assert_frame_equal( @@ -978,7 +976,6 @@ def test_graph_creation_edges_multi_col_vertices(graph_file, directed): edges.sort_values(by=vertexCol).reset_index(drop=True), check_dtype=False, ) - if directed: assert_frame_equal( edge_list_view.sort_values(by=vertexCol).reset_index(drop=True), diff --git a/python/cugraph/cugraph/tests/structure/test_graph_mg.py b/python/cugraph/cugraph/tests/structure/test_graph_mg.py index cd5ad00694f..0e47b89d7de 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph_mg.py +++ b/python/cugraph/cugraph/tests/structure/test_graph_mg.py @@ -363,3 +363,47 @@ def test_graph_creation_edge_properties(dask_client, graph_file, edge_props): G = cugraph.Graph(directed=True) G.from_dask_cudf_edgelist(df, source="0", destination="1", **prop_keys) + + +@pytest.mark.parametrize("directed", [True, False]) +@pytest.mark.parametrize("renumber", [True, False]) +@pytest.mark.parametrize("graph_file", datasets) +def test_graph_creation_properties(dask_client, graph_file, directed, renumber): + srcCol = "src" + dstCol = "dst" + wgtCol = "wgt" + df = cudf.read_csv( + graph_file, + delimiter=" ", + names=[srcCol, dstCol, wgtCol], + dtype=["int32", "int32", "float32"], + header=None, + ) + ddf = dask_cudf.from_cudf(df, npartitions=2) + + if renumber: + # trigger renumbering by passing a list of vertex column + srcCol = [srcCol] + dstCol = [dstCol] + vertexCol = srcCol + dstCol + else: + vertexCol = [srcCol, dstCol] + + sG = cugraph.Graph(directed=directed) + mG = cugraph.Graph(directed=directed) + sG.from_cudf_edgelist( + df, source=srcCol, destination=dstCol, edge_attr=wgtCol) + mG.from_dask_cudf_edgelist( + ddf, source=srcCol, destination=dstCol, edge_attr=wgtCol) + + columns = vertexCol.copy() + columns.append(wgtCol) + + sG_edgelist_view = sG.view_edge_list().sort_values( + by=vertexCol).reset_index(drop=True).loc[:, columns] + mG_edgelist_view = mG.view_edge_list().compute().sort_values( + by=vertexCol).reset_index(drop=True).loc[:, columns] + + assert sG.number_of_nodes() == mG.number_of_nodes() + assert sG.number_of_edges() == mG.number_of_edges() + assert_frame_equal(sG_edgelist_view, mG_edgelist_view, check_dtype=False) From 7ab383e9da01977afcfc251b931566838a28a285 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 3 Aug 2023 12:14:55 -0700 Subject: [PATCH 31/35] fix style --- .../cugraph/tests/structure/test_graph_mg.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/python/cugraph/cugraph/tests/structure/test_graph_mg.py b/python/cugraph/cugraph/tests/structure/test_graph_mg.py index 0e47b89d7de..3024e50402a 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph_mg.py +++ b/python/cugraph/cugraph/tests/structure/test_graph_mg.py @@ -391,18 +391,25 @@ def test_graph_creation_properties(dask_client, graph_file, directed, renumber): sG = cugraph.Graph(directed=directed) mG = cugraph.Graph(directed=directed) - sG.from_cudf_edgelist( - df, source=srcCol, destination=dstCol, edge_attr=wgtCol) - mG.from_dask_cudf_edgelist( - ddf, source=srcCol, destination=dstCol, edge_attr=wgtCol) + sG.from_cudf_edgelist(df, source=srcCol, destination=dstCol, edge_attr=wgtCol) + mG.from_dask_cudf_edgelist(ddf, source=srcCol, destination=dstCol, edge_attr=wgtCol) columns = vertexCol.copy() columns.append(wgtCol) - - sG_edgelist_view = sG.view_edge_list().sort_values( - by=vertexCol).reset_index(drop=True).loc[:, columns] - mG_edgelist_view = mG.view_edge_list().compute().sort_values( - by=vertexCol).reset_index(drop=True).loc[:, columns] + + sG_edgelist_view = ( + sG.view_edge_list() + .sort_values(by=vertexCol) + .reset_index(drop=True) + .loc[:, columns] + ) + mG_edgelist_view = ( + mG.view_edge_list() + .compute() + .sort_values(by=vertexCol) + .reset_index(drop=True) + .loc[:, columns] + ) assert sG.number_of_nodes() == mG.number_of_nodes() assert sG.number_of_edges() == mG.number_of_edges() From e06a8108421e9ced8a40beecd6080e8eb22f86de Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 3 Aug 2023 12:31:05 -0700 Subject: [PATCH 32/35] unrenumber before extracting the upper triangular part --- .../structure/graph_implementation/simpleDistributedGraph.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index eb5b3cf2d13..e84e87b6efd 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -411,6 +411,9 @@ def view_edge_list(self): srcCol = self.renumber_map.renumbered_src_col_name dstCol = self.renumber_map.renumbered_dst_col_name edgelist_df = self.edgelist.edgelist_df + # unrenumber before extracting the upper triangular part + edgelist_df = self.renumber_map.unrenumber(edgelist_df, srcCol) + edgelist_df = self.renumber_map.unrenumber(edgelist_df, dstCol) edgelist_df[srcCol], edgelist_df[dstCol] = edgelist_df[ [srcCol, dstCol] From de8b904d1f57278ac5aa05cb0f1b52e172a3acac Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 3 Aug 2023 13:42:03 -0700 Subject: [PATCH 33/35] rename column name --- .../cugraph/tests/community/test_induced_subgraph_mg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/tests/community/test_induced_subgraph_mg.py b/python/cugraph/cugraph/tests/community/test_induced_subgraph_mg.py index 3a6a6e0d409..d93fa3b547d 100644 --- a/python/cugraph/cugraph/tests/community/test_induced_subgraph_mg.py +++ b/python/cugraph/cugraph/tests/community/test_induced_subgraph_mg.py @@ -90,8 +90,8 @@ def input_expected_output(input_combo): # Sample k vertices from the cuGraph graph # FIXME: Leverage the method 'select_random_vertices' instead - srcs = G.view_edge_list()["src"] - dsts = G.view_edge_list()["dst"] + srcs = G.view_edge_list()["0"] + dsts = G.view_edge_list()["1"] vertices = cudf.concat([srcs, dsts]).drop_duplicates() vertices = vertices.sample(num_seeds).astype("int32") From e586c38fcb83be2627d04615284905ad36ae39e2 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 3 Aug 2023 16:06:58 -0700 Subject: [PATCH 34/35] add multicolumn support for MG when getting the edgelist view --- .../graph_implementation/simpleDistributedGraph.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index e84e87b6efd..097f3f37962 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -395,6 +395,7 @@ def view_edge_list(self): edgelist_df = self.input_df is_string_dtype = False + is_multi_column = False wgtCol = simpleDistributedGraphImpl.edgeWeightCol if not self.properties.directed: srcCol = self.source_columns @@ -407,13 +408,16 @@ def view_edge_list(self): srcCol = self.renumber_map.renumbered_src_col_name dstCol = self.renumber_map.renumbered_dst_col_name - if isinstance(srcCol, list) and len(srcCol) == 1: + if isinstance(srcCol, list): srcCol = self.renumber_map.renumbered_src_col_name dstCol = self.renumber_map.renumbered_dst_col_name edgelist_df = self.edgelist.edgelist_df # unrenumber before extracting the upper triangular part - edgelist_df = self.renumber_map.unrenumber(edgelist_df, srcCol) - edgelist_df = self.renumber_map.unrenumber(edgelist_df, dstCol) + if len(srcCol) == 1: + edgelist_df = self.renumber_map.unrenumber(edgelist_df, srcCol) + edgelist_df = self.renumber_map.unrenumber(edgelist_df, dstCol) + else: + is_multi_column = True edgelist_df[srcCol], edgelist_df[dstCol] = edgelist_df[ [srcCol, dstCol] @@ -426,7 +430,7 @@ def view_edge_list(self): # will be halved. edgelist_df[wgtCol] /= 2 - if is_string_dtype: + if is_string_dtype or is_multi_column: # unrenumber the vertices edgelist_df = self.renumber_map.unrenumber(edgelist_df, srcCol) edgelist_df = self.renumber_map.unrenumber(edgelist_df, dstCol) From 00eab4453c12eb54ceebea4683827b4ff2bc1a82 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 3 Aug 2023 16:17:51 -0700 Subject: [PATCH 35/35] fix typo --- .../structure/graph_implementation/simpleDistributedGraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index 097f3f37962..90db2c6b1f5 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -413,7 +413,7 @@ def view_edge_list(self): dstCol = self.renumber_map.renumbered_dst_col_name edgelist_df = self.edgelist.edgelist_df # unrenumber before extracting the upper triangular part - if len(srcCol) == 1: + if len(self.source_columns) == 1: edgelist_df = self.renumber_map.unrenumber(edgelist_df, srcCol) edgelist_df = self.renumber_map.unrenumber(edgelist_df, dstCol) else: